*---------------------------------------------------
cap clear mata
cap clear
capture log close
program drop _all
macro drop _all
version 11
set mem 700m
set mat 2000
set more off

cd "E:\REStat_MS14767_Vol96(2)\Data preparation Compustat segment"
log using "4_segmentvars.log", replace


********************************************************************************
********                                                	      **************
********  in this file we create a sample of compustat variables  **************
********                compustatvars.dta                  		**************
********************************************************************************


use "compustat_master_america.dta", clear
so ticker year
keep  ticker vantagekey name countryinc employees sales forn_sal ta year SICALL2-SICALL15 SIC4 CUSIP rdexpense mkvalq
rename CUSIP cusip

		**********************************************************
		*****                VERY IMPORTANT STEP             *****
		*****           WE KEEP ONLY AMERICAN FIRMS          *****
		*****             FOR WHICH WE HAVE SALES            *****
		**********************************************************

* sales are changed into sales_new by substracting forn_sales
replace forn_sales=0 if  forn_sales==.
gen sales_new=sales- forn_sales
count if sales_new<0
replace sales_new=. if sales_new<0
count if sales_new==.
drop if sales_new==.
label var sales_new "Sales - foreign sales"

compress
drop countryinc vantagekey sales forn_sales


********************here we merge this dataset with the segment data***********
******************and do some checks:

merge 1:m ticker year using "segmentdata.dta"


		/*
		keep if _merge ==2
		sort  ticker
		drop if ticker[_n] ==ticker[_n-1]
		keep ticker
		merge 1:m ticker using "compustatvars_1.dta"

		use "compustatvars_2.dta", clear
		keep if _merge == 1
		sort ticker 
		drop  if  SICALL3 ==.
		count if ticker[_n] != ticker[_n-1]

		use "compustatvars_2.dta", clear
		keep if _merge ==3
		sort ticker
		drop if ticker[_n] ==ticker[_n-1]
		count if  cusip !=  cusip_segm
		*/


* Here we drop the observations wich have several SIC-codes in the 
*compustat data set, but are not included in the segment data set.		
drop  if  SICALL3 !=.&_merge ==1

drop SICALL*

replace sales_segm = sales_new if _merge ==1
destring sics1, replace
replace sics1 = SIC4 if _merge ==1
replace ta_segm = ta if _merge ==1
replace rds_segm = rdexpense if _merge ==1

gen dataset = "segm"
replace dataset = "comp" if _merge ==1
replace dataset ="inboth" if _merge ==3
label var dataset "Indicates the origin of Obvervation"

***************************************************************************
* Now the cusip identifier is taken from the the segment data
* 

gen cusip9 = cusip_segm
replace cusip9 =cusip if cusip9 == ""

rename name name_comp
rename conm name_segm
gen name = name_segm
replace name = name_comp if name ==""




****************************************************************************
* It occures that the cusip Identifier is the same for different ticker from the
* compustat and the segment data. Example:
* ticker	cusip9	year	dataset
***AHN	00209C102	1986	comp
***AHNA	00209C102	1986	segm
* since the segment data is our base dataset, we keep the ticker from the Segment data: 
****************************************************************************

sort cusip9 ticker dataset
gen j = 1 if cusip9==cusip9[_n-1]& ticker !=ticker[_n-1]
egen k = max(j), by(cusip9)
gen ticker2 = ticker if k ==1 & dataset =="comp"
sort cusip9 dataset
replace ticker2 = ticker2[_n-1] if ticker2=="" & k == 1
drop if k == 1 & dataset == "comp"

drop j k

****************************************************************************
* It occures that the firms Name is the same for different ticker from the
* compustat and the segment data. Example:
*name						ticker	cusip9		year	dataset
*ADAPTIVE BROADBAND CORP.	ADAP	00650M104	1986	segm
*ADAPTIVE BROADBAND CORP.	ADAPQ	00650M10X	1986	comp
* since the segment data is our base dataset, we keep the cusip from the Segment data: 
****************************************************************************


sort name ticker dataset
gen j = 1 if name==name[_n-1]& ticker !=ticker[_n-1]
egen k = max(j), by(name)
replace ticker2 = ticker if k ==1 & dataset =="comp"
sort name dataset
replace ticker2 = ticker2[_n-1] if ticker2=="" & k == 1

drop if k == 1 & dataset == "comp"
replace ticker2 = ticker if ticker2 == ""
drop j k

* this step is yust a consistency chek.
sort year ticker2
gen g = 1 if ticker2 == ticker2[_n-1] & ticker!= ticker[_n-1]
egen x = max(g), by(ticker2)
drop if x == 1& dataset == "inboth"
drop g x ticker2



egen s = seq(), by(year ticker sics1)
count if s !=1

****************************************************************************
*if the firms reports different segments and Compustat assign them the
*same SIC code we aggregate them:
****************************************************************************

egen sales = total(sales_segm), by(year ticker sics1)
label var sales "Sales in segment, in thousands"
egen ta_new = total(ta_segm), by(year ticker sics1)
label var ta_new "Total Asset in segment, in thousands"
egen rd_new = total(rds_segm), by(year ticker sics1)
label var rd_new "R&D Expenses in segment, in thousands"
keep if s ==1
replace ta_new = . if ta_segm ==.
replace rd_new = . if rds_segm==.
drop sales_segm ta_segm rds_segm

****************************************************************************
*Firms from the segment data which only have one segment reported:
*The missing of r&d and ta can be replaced by available entries from the 
* compustat data:
****************************************************************************
  
egen single = total(s), by(year ticker)
replace rd_new = rdexpense if single ==1 & _merge == 3 & rd_new ==.
replace ta_new = ta if single ==1 & _merge == 3 & ta_new ==.
drop s single

****************************************************************************
*Since total asset und R&D are often missing in
*the segment file we produce two alternative variables (TAcorr
*and R&Dcorr) which are defined as the proportion of TA and R&D from the
*main file according to the sales figures from the segment file.
****************************************************************************

*This appraoch only makes sense for observations, where merge ==3,
* since only then we have the ta and rd information from the main file
* and the sales from the segment data:
egen  sales_tot = total(sales), by(year ticker)
gen sales_prop = sales/sales_tot if sales_tot !=0

label var sales_prop "Proportion of total sales in segment"
drop sales_tot

egen ta_sum = max(ta) if _merge ==3 & ta_new==., by(year ticker)
gen ta_corr = ta_sum*sales_prop
replace ta_corr = ta_new if ta_corr == .
drop ta_sum


egen rd_sum = max(rdexpense) if _merge ==3 & rd_new==. , by(year ticker)
gen rd_corr = rd_sum*sales_prop
replace rd_corr = rd_new if rd_corr == .
drop rd_sum ta SIC4 mkvalq rdexpense gvkey employees sales_new  name_segm cusip_segm _merge name_comp cusip
rename sics1 SIC

rename cusip9 cusip

gen SIC2=int(SIC/100)
label var SIC2 "SIC4/100"

save "compustat+segment.dta", replace

********************* we correct some variables  ****************
********************* and generate some others     ****************

so year ticker SIC

egen firmnum=group(ticker)
label var firmnum "group(ticker)"
egen ncomp=count(firmnum), by(SIC year)
label var ncomp "number of competitors per year/SIC4"
drop firmnum

********************************************************
*****           
*****           here we define market shares (our dep. var.) defining the market as the SIC4 industry
*****           
********************************************************


egen tot_sales=sum(sales), by (SIC year)
label var tot_sales "sum of sales by SIC, year"
gen MS=sales/tot_sales
label var MS "Market shares firm/year at the SIC level"

gen aa=MS*MS
egen HHI=sum(aa), by(SIC year)
label var HHI "HHI index at the SIC level"
drop aa

so ticker year SIC
save "segment_compustatvars.dta", replace

********************************************************
*****           
*****   We have to make the dataset wide such that we have 
*****	only one entry per firm/year.
*****	This becomes important when merging it to the RJY dataset
*****           
********************************************************

keep ticker  year SIC MS dataset 
egen nr_sic = seq(), by(year ticker) 
reshape wide SIC MS, i(ticker year) j(nr_sic)
sort  year ticker
count if ticker[_n] == ticker[_n-1]
*0

save "segment_wide.dta", replace

keep ticker year

save "ticker_year.dta", replace
log close
